## New names:
## Rows: 2607 Columns: 20
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): Country Name, continent dbl (18): ...1, Year, Agriculture, value added (%
## of GDP), CO2 emissions (me...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Preview the first six rows to sanity-check the import and the column types.
data_frame |>
  head(n = 6L)
## # A tibble: 6 × 20
## ...1 `Country Name` Year Agriculture, value added (…¹ CO2 emissions (metri…²
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 0 Afghanistan 1962 NA 0.0738
## 2 1 Afghanistan 1967 NA 0.124
## 3 2 Afghanistan 1972 NA 0.131
## 4 3 Afghanistan 1977 NA 0.183
## 5 4 Afghanistan 1982 NA 0.166
## 6 5 Afghanistan 1987 NA 0.276
## # ℹ abbreviated names: ¹`Agriculture, value added (% of GDP)`,
## # ²`CO2 emissions (metric tons per capita)`
## # ℹ 15 more variables:
## # `Domestic credit provided by financial sector (% of GDP)` <dbl>,
## # `Electric power consumption (kWh per capita)` <dbl>,
## # `Energy use (kg of oil equivalent per capita)` <dbl>,
## # `Exports of goods and services (% of GDP)` <dbl>, …
# Keep only the observations recorded for 1962.
filtered_data <- data_frame |>
  filter(Year == 1962)

# Complete (CO2, GDP per capita) pairs for 1962.
# Built once and reused by the scatter plot below and by the correlation
# analysis that follows, so both are guaranteed to use exactly the same rows.
correlation_columns <- filtered_data |>
  select(co2 = starts_with("CO2"), gdpPercap) |>
  filter(!is.na(co2), !is.na(gdpPercap))

# Scatter plot of CO2 emissions against GDP per capita for 1962.
correlation_columns |>
  ggplot() +
  geom_point(aes(x = gdpPercap, y = co2)) +
  labs(
    x = "GDP per capita",
    y = "CO2 emissions (metric tons per capita)",
    title = "CO2 emissions per capita generally increases with GDP"
  )
# Both variables are continuous and quantitative, so Pearson's correlation is
# used (it is also the default method of cor()).
# The result, 0.9260817, indicates a strong positive relationship: the two
# variables tend to move in the same direction.
with(
  correlation_columns,
  cor(co2, gdpPercap, method = "pearson")
)
## [1] 0.9260817
# Significance test for the Pearson correlation computed above.
# The p-value is the probability of observing a correlation coefficient at
# least as extreme as r when sampling from a population in which the null
# hypothesis (true correlation = 0) holds.
# The arguments are passed as `df$column` expressions; cor.test() echoes them
# verbatim in the `data:` line of its printed result.
cor.test(correlation_columns$co2,
correlation_columns$gdpPercap,
method = 'pearson')
##
## Pearson's product-moment correlation
##
## data: correlation_columns$co2 and correlation_columns$gdpPercap
## t = 25.269, df = 106, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8934697 0.9489792
## sample estimates:
## cor
## 0.9260817
#4. On the unfiltered data, answer “In what year is the correlation between ‘CO2 emissions (metric tons per capita)’ and gdpPercap the strongest?”
# Pearson correlation between CO2 emissions and GDP per capita, computed
# separately for every year on the unfiltered data and drawn as one labelled
# bar per year so the year with the strongest correlation can be read off.
data_frame |>
  select(Year, co2 = starts_with("CO2"), gdpPercap) |>
  # Keep only rows where both variables are present.
  filter(!is.na(co2) & !is.na(gdpPercap)) |>
  summarise(r = cor(co2, gdpPercap), .by = Year) |>
  ggplot(aes(x = Year, y = r)) +
  geom_col() +
  # Print the rounded coefficient just above each bar.
  geom_text(aes(label = round(r, 2)), vjust = -0.5) +
  # One axis tick per year present in the data.
  scale_x_continuous(breaks = sort(unique(data_frame$Year))) +
  theme(legend.position = "none")
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Bubble chart for 1967: CO2 emissions against GDP per capita, with point size
# mapped to population and colour mapped to continent.
# NOTE(review): the object name `plot` is the same as base R's plot() function;
# it is kept unchanged here in case later code refers to it.
plot <- data_frame |>
  filter(Year == 1967) |>
  select(
    co2 = starts_with("CO2"),
    gdpPercap,
    pop,
    continent
  ) |>
  ggplot() +
  geom_point(
    aes(x = gdpPercap, y = co2, size = pop, color = continent)
  ) +
  # Show the x axis in thousands of dollars (e.g. 5000 -> "$5K").
  scale_x_continuous(
    labels = label_dollar(scale = 1 / 1000, suffix = "K")
  ) +
  # Zoom the view without dropping data; points outside these limits are
  # simply not visible.
  coord_cartesian(
    xlim = c(0, 20000),
    ylim = c(0, 25)
  ) +
  labs(
    x = "GDP per capita",
    y = "CO2 emissions (metric tons per capita)"
  )

# Convert the static ggplot into an interactive plotly widget.
ggplotly(plot)
### box plot for energy use across continents
### run a linear regression model
### run anova
# One-way ANOVA of energy use by continent, fitted as a linear model with
# continent as the only predictor.
# NOTE(review): the unfiltered data is used, so rows from every year are pooled
# and a country can contribute several observations; confirm this is intended.
fit <- lm(
  `Energy use (kg of oil equivalent per capita)` ~ continent,
  data = data_frame
)

# F-test for the continent term.
anova(fit)
## Analysis of Variance Table
##
## Response: Energy use (kg of oil equivalent per capita)
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 4 771482483 192870621 51.459 < 2.2e-16 ***
## Residuals 843 3159591816 3748033
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
I used ANOVA to test whether there are significant differences in energy use across continents, under the assumption that the average energy use is equal for all continents.
Null Hypothesis H0: there is no difference in mean energy use across continents.
Alternative Hypothesis H1: the mean energy use of at least one continent differs from the others.
The significance threshold for the p-value is set to 0.05.
The last column of the table, Pr(>F), displays the p-value: the probability, under the null hypothesis, of observing an F-value at least as extreme as the one obtained. Here the p-value is below 2.2 × 10⁻¹⁶, far below the 0.05 threshold, so we reject the null hypothesis.
if we want to compare individual groups, we can use a contrast matrix.
##Is there a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990? (stats test needed)
Since we are comparing two independent groups, we will use a t-test to compare the group means.
Null Hypothesis: There is no significant difference in imports as a percentage of GDP between Asia and Europe. Alternative Hypothesis: The mean import percentage of GDP differs between Asia and Europe.
# Imports of goods and services (% of GDP) for Asian and European rows observed
# after 1990, keeping only rows where the import figure is present.
euro_asia <- data_frame |>
  filter(Year > 1990, continent %in% c("Asia", "Europe")) |>
  select(continent, import_goods = starts_with("Import")) |>
  # Test the column directly: the previous if_all() lambda ignored its argument
  # and re-checked import_goods once per column.
  filter(!is.na(import_goods))

# Two-sample t-test comparing mean imports between the two continents
# (t.test() defaults to the Welch test, i.e. unequal variances).
# `t_test` holds only the p-value of that test.
t_test <- t.test(import_goods ~ continent, data = euro_asia)$p.value

# Display the p-value so the conclusion drawn from it can be checked.
t_test
Since p > 0.05, we fail to reject the null hypothesis: the data do not show a significant difference between Asia and Europe in imports of goods and services as a percentage of GDP.
##What is the country (or countries) that has the highest ‘Population density (people per sq. km of land area)’ across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)
# Top 10 countries by mean population density over all available years, shown
# as a bar chart ordered from the densest country downwards.
# NOTE(review): the question asks for the highest average *ranking* per time
# point; this ranks countries by their mean density instead. Confirm which
# definition is intended.
data_frame |>
  group_by(`Country Name`) |>
  summarise(
    avg_pop_dens_yrs = mean(
      `Population density (people per sq. km of land area)`,
      na.rm = TRUE
    ),
    .groups = "drop"
  ) |>
  # Countries with no density values at all produce NaN; drop them.
  filter(!is.na(avg_pop_dens_yrs)) |>
  arrange(desc(avg_pop_dens_yrs)) |>
  head(10) |>
  # A character x axis is drawn alphabetically regardless of row order, so the
  # bars are explicitly reordered by descending density.
  ggplot(
    aes(
      x = reorder(`Country Name`, -avg_pop_dens_yrs),
      y = avg_pop_dens_yrs
    )
  ) +
  geom_col() +
  geom_text(aes(label = round(avg_pop_dens_yrs, 2)), vjust = -0.25) +
  coord_cartesian(ylim = c(0, 17000)) +
  # Keep the axis title that the plain column mapping would have produced.
  labs(x = "Country Name") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
As the bar chart shows, the Macao region of China has the highest average population density across the years.
# Top 10 countries by percentage change in life expectancy at birth between
# 1962 and 2007, shown as a bar chart ordered from the largest increase down.
data_frame |>
  # Only the two endpoint years are needed.
  filter(Year %in% c(1962, 2007)) |>
  group_by(`Country Name`) |>
  summarise(
    # `[1]` yields NA when a country has no row for that year, so every group
    # returns exactly one row (summarise() no longer accepts 0-row results).
    birth_exp_2007 =
      `Life expectancy at birth, total (years)`[Year == 2007][1],
    birth_exp_1962 =
      `Life expectancy at birth, total (years)`[Year == 1962][1],
    .groups = "drop"
  ) |>
  mutate(
    percent_change = (birth_exp_2007 - birth_exp_1962) / birth_exp_1962 * 100
  ) |>
  # Drop countries missing either endpoint value.
  filter(!is.na(percent_change)) |>
  arrange(desc(percent_change)) |>
  head(10) |>
  # A character x axis is drawn alphabetically regardless of row order, so the
  # bars are explicitly reordered by descending percentage change.
  ggplot(
    aes(x = reorder(`Country Name`, -percent_change), y = percent_change)
  ) +
  geom_col() +
  geom_text(aes(label = round(percent_change, 2)), vjust = -0.25) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    x = "Country Name",
    # The plotted quantity is a percentage change, not an average.
    y = "Life expectancy increase, 1962 to 2007 (%)"
  )
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'Country Name'. You can override using the
## `.groups` argument.
We can see that Bhutan experienced the greatest percentage increase in life expectancy at birth between 1962 and 2007, followed by the Maldives.